Try using a Targetted Gene Correlation Network to identify the LFY1 and LFY2 networks
Install…
BiocManager::install(c("AnnotationDBi", "GO.db", "preprocessCore", "impute", "rrvgo", "ComplexHeatmap"))
BiocManager::install(c("sva", "GOSim"))
install.packages("MLmetrics")
remotes::install_local("~/Downloads/GOSim_1.40.0.tgz") # get download link from https://bioconductor.org/packages/3.18/bioc/html/GOSim.html
remotes::install_github('juanbot/CoExpNets')
# remotes::install_github("aliciagp/TGCN") # don't use this, use mine, see below!
use my modified version of TGCN
# remotes::install_local("~/git/TGCN/", force = TRUE)
# or from web:
remotes::install_github("https://github.com/jnmaloof/TGCN", ref = "dev")
library(TGCN)
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
✔ purrr 1.0.4 ── Conflicts ────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(ggpubr)
library(gridExtra)
Attaching package: ‘gridExtra’
The following object is masked from ‘package:dplyr’:
combine
library(gplots)
Attaching package: ‘gplots’
The following object is masked from ‘package:stats’:
lowess
library(patchwork)
sample.description <- read.csv("../output/sample.description.gametophyte.all.csv")
load read counts. These should be log2 cpm or Voom transformed or something similar.
lcpm <- read.csv("../output/gam_combined_log2cpm.csv.gz", row.names = 1, check.names = FALSE)
rownames(lcpm) <- str_remove(rownames(lcpm), fixed(".v2"))
head(lcpm)
dim(lcpm)
[1] 27206 71
Get the GO info and convert to a list.
GOs <- read_delim("../../McConnell_Samples/input/Crichardii_676_v2.1.annotation_info.txt")
Rows: 75253 Columns: 16── Column specification ────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (16): #pacId, locusName, transcriptName, peptideName, Pfam, Panther, ec, KOG, KO...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
GOs <- GOs[match(rownames(lcpm), GOs$transcriptName),] %>% drop_na(transcriptName)
gene2GO.list <- GOs %>%
select(transcriptName, GO) %>%
rowwise() %>%
mutate(GOlist = str_split(GO, pattern = " ")) %>%
ungroup() %>%
mutate(GOlist = set_names(GOlist, transcriptName)) %>%
pull(GOlist)
Looking at the example notebook on the github site, it looks like genes in rows, samples in columns
My genes of interest.
CrLFY1 <- "Ceric.33G031700"
CrLFY2 <- "Ceric.18G076300"
Subset data to build LFY1 and LFY2 networks. Want to find networks around expression of LFY1 or LFY2, so pull those out and also create matrices that don’t have them.
LFY1.expression <- lcpm[str_detect(rownames(lcpm), CrLFY1),] %>%
unlist() %>%
as.vector()
input_for_LFY1 <- lcpm[str_detect(rownames(lcpm), CrLFY1, negate = TRUE),]
LFY2.expression <- lcpm[str_detect(rownames(lcpm), CrLFY2),] %>%
unlist() %>%
as.vector()
input_for_LFY2 <- lcpm[str_detect(rownames(lcpm), CrLFY2, negate = TRUE),]
path <- "../output/TGCN_LFY1_gam"
if(dir.exists(path)) system(str_c("rm -r ", path))
if(!dir.exists(path)) dir.create(path)
r.lfy1 <- testAllCutoffs(exprData=input_for_LFY1,
target=LFY1.expression,
covs=NULL,
train.split=0.7,
nfolds=5,
t=10,
path=path,
targetName="LFY1",
tissueName="gam",
seed=3333,
cutoffs=10:1,
n=100,
m=10,
s=10,
minCor=0.3,
maxTol=3,
save=T,
overwrite=T,
approach="enrichment", # the approach selected to complete the seed modules
report=F, # if report=T, an automated report will be created
gene2GO=gene2GO.list,
cellTypeAnnotation=FALSE)
- Step 1: Create a linear regression model for each gene set of hubs based on their ratio of appearance
Apply LASSO 10 times for feature selection
Warning: No enrichment can pe performed - there are no feasible GO terms!Warning: No enrichment can pe performed - there are no feasible GO terms!Warning: No enrichment can pe performed - there are no feasible GO terms!
Number of hubs per ratio of appearance
cutoff10 cutoff9 cutoff8 cutoff7 cutoff6 cutoff5 cutoff4 cutoff3 cutoff2
1 2 2 3 8 16 20 38 75
cutoff1
198
Cutoffs selected are 9 8 7 6 5 4 3 2 1
Get final model and the cv error for cutoff 9
Get final model and the cv error for cutoff 8
Get final model and the cv error for cutoff 7
Get final model and the cv error for cutoff 6
Get final model and the cv error for cutoff 5
Get final model and the cv error for cutoff 4
Get final model and the cv error for cutoff 3
Get final model and the cv error for cutoff 2
Get final model and the cv error for cutoff 1
- Step 2: represent train and test error per ratio of appearance model
- Step 3: creating a network for each ratio of appearance where the number of hubs is <=30
Step 3.1: TGCN creation for ratio of appearance 9
Step 3.2: TGCN characterization for cutoff 9
Warning: No enrichment can pe performed - there are no feasible GO terms!
Step 3.1: TGCN creation for ratio of appearance 8
Step 3.2: TGCN characterization for cutoff 8
Step 3.1: TGCN creation for ratio of appearance 7
Step 3.2: TGCN characterization for cutoff 7
Step 3.1: TGCN creation for ratio of appearance 6
Step 3.2: TGCN characterization for cutoff 6
Step 3.1: TGCN creation for ratio of appearance 5
Step 3.2: TGCN characterization for cutoff 5
Step 3.1: TGCN creation for ratio of appearance 4
Step 3.2: TGCN characterization for cutoff 4
r.lfy1$selectRatio$nHubs + r.lfy1$selectRatio$stats
Focus on 6
p <- lapply(r.lfy1$nets, function(cutoff) cutoff$GOenrich$plotStats)
p$c6 + theme(text=element_text(size=10))
r.lfy1$nets$c6$net$moduleSizeSelectionPlot
Correlation with trait, I assume
r.lfy1$nets$c5$net$plotCorr
r.lfy1$nets$c6$net$plotCorr
DT::datatable(r.lfy1$nets$c6$net$modules)
knitr::include_graphics("../output/TGCN_LFY1/results/LFY1_ALL_c6_TGCN_crossTabPlot.png")
grid.arrange(r.lfy1$nets$c6$GOenrich$plotStats, r.lfy1$nets$c6$GOenrich$plotNterms, nrow=2)
path <- "../output/TGCN_LFY2_gam"
if(dir.exists(path)) system(str_c("rm -r ", path))
if(!dir.exists(path)) dir.create(path)
r.lfy2 <- testAllCutoffs(exprData=input_for_LFY2,
target=LFY2.expression,
covs=NULL,
train.split=0.7,
nfolds=5,
t=10,
path=path,
targetName="LFY2",
tissueName="gam",
seed=3333,
cutoffs=10:1,
n=100,
m=10,
s=10,
minCor=0.3,
maxTol=3,
save=T,
overwrite=T,
approach="enrichment", # the approach selected to complete the seed modules
report=F, # if report=T, an automated report will be created
gene2GO=gene2GO.list,
cellTypeAnnotation=FALSE)
- Step 1: Create a linear regression model for each gene set of hubs based on their ratio of appearance
Apply LASSO 10 times for feature selection
Number of hubs per ratio of appearance
cutoff10 cutoff9 cutoff8 cutoff7 cutoff6 cutoff5 cutoff4 cutoff3 cutoff2
0 0 0 3 5 11 20 39 73
cutoff1
240
Cutoffs selected are 7 6 5 4 3 2 1
Get final model and the cv error for cutoff 7
Get final model and the cv error for cutoff 6
Get final model and the cv error for cutoff 5
Get final model and the cv error for cutoff 4
Get final model and the cv error for cutoff 3
Get final model and the cv error for cutoff 2
Get final model and the cv error for cutoff 1
- Step 2: represent train and test error per ratio of appearance model
- Step 3: creating a network for each ratio of appearance where the number of hubs is <=30
Step 3.1: TGCN creation for ratio of appearance 7
Step 3.2: TGCN characterization for cutoff 7
Step 3.1: TGCN creation for ratio of appearance 6
Step 3.2: TGCN characterization for cutoff 6
Step 3.1: TGCN creation for ratio of appearance 5
Step 3.2: TGCN characterization for cutoff 5
Step 3.1: TGCN creation for ratio of appearance 4
Step 3.2: TGCN characterization for cutoff 4
r.lfy2$selectRatio$nHubs + r.lfy2$selectRatio$stats
Focus on 5
p <- lapply(r.lfy2$nets, function(cutoff) cutoff$GOenrich$plotStats)
ggarrange(p$c5 + theme(text=element_text(size=10)),
p$c6 + theme(text=element_text(size=10)),
ncol=2, nrow=1, common.legend=T, legend="bottom")
r.lfy2$nets$c5$net$moduleSizeSelectionPlot
Correlation with trait, I assume
r.lfy2$nets$c5$net$plotCorr
DT::datatable(r.lfy2$nets$c5$net$modules)
knitr::include_graphics("../output/TGCN_LFY2/results/LFY2_ALL_c5_TGCN_crossTabPlot.png")
grid.arrange(r.lfy2$nets$c5$GOenrich$plotStats, r.lfy2$nets$c5$GOenrich$plotNterms, nrow=2)
LFY1.c5 <- read_csv("../output/TGCN_LFY1/results/LFY1_ALL_c5_TGCN.csv")
Rows: 320 Columns: 3── Column specification ────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): hubGene, genes
dbl (1): cor
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
LFY1.c5
All genes
rbind(LFY1=LFY1.expression,lcpm[unique(LFY1.c5$genes),]) %>%
as.matrix() %>%
heatmap.2(trace="none", cexRow= 0.6, cexCol=0.7, col="bluered", scale="row")
Error in hclustfun(distr) : NA/NaN/Inf in foreign function call (arg 10)
hubgenes
All genes
hubgenes
All genes
hubgenes